using Dates
using MultivariateStats
using Plots
using NCDatasets
using StatsBase
using Unitful
using DataFrames
using Distributions
using StatsPlots
using Turing
using DynamicHMCProject 01
Ensemble Q-Q and GLM
1 Exectuive Summary
In this project, my objective was to model hourly precipitation at a single location given the total precipitation of that day. There are many ways to approach this problem. I opted to explore the capability of generalized linear models. Initially, I created a GLM that related the hourly temperature to the hourly precipitation. Then I created a GLM that related the hourly temperature and the hourly change in temperature to the hourly precipitation. The precipitation could then be estimated provided there is data on temperature. If there are concerns about the correlational support between temperature and precipitation, they are valid, because model performance was laughably bad. Quantile-Quanitle plots between training data and model output showed no clear trends. However, if daily total precipitation were used as a limit for model output, Q-Q correlation may improve by about a factor of 10.
1.1 GLM Specifics
I assumed that the distribution of hourly precipitation for annual observation period could be described by a log-normal curve. Given this assumption, the intensity of hourly precipitation can be described using two paramters, \(\mu\) the mean and \(\sigma\) is the standard deviation. I estimated these parameters using these equations.
\[\mu_i = \alpha+\beta x_i \] {#eq-a}
\[\sigma_i = | \gamma+\chi x_i |\] {#eq-b}
Note that I have chosen to use the identity link function for equation ?@eq-a and a non-cononical absolute value for ?@eq-b. The idenity link function was chosen as \(\mu \in (-\infty,\infty)\) and, the absolute value function was chosen as \(\sigma \in (0,\infty)\).
1.2 Using Observed Daily Total Precipitation as a Boundary Condition
If I sampled from a distribution for \(y_i(x_i |\theta)\) twentyfour times, the sum would not be guarranteed to stay within the observed daily precipitation for that time period. To prevent overestimating daily precipitation, I reject any sample of the distribution of \(y_i(x_i|\theta)\) that put the total daily precipitation above the corresponding observation.
2 Data Import
If you can see the following heatmaps, that means the data was succesfully imported. Since I am only downscaling temporally, I do not need the entire grid. I have choosen to use the time series at -84 degrees east and 33 degrees north.
a = heatmap(t2m_vals[:,:,:1])
b = heatmap(tp_vals[:,:,1])
c = heatmap(cpc_vals[:,:,1])
plot(a,b,c)3 Data Wrangling and Exploration
3.1 Distribution of Hourly Temperature and Precipitation (all ERA5)
# Plot Distribution of Hourly Temperature and RainFall (all ERA5)
filter = df_era_hr[!,:precip] .> 0.0u"mm"
a = histogram(df_era_hr[filter,:precip],normalize = :pdf)
b = histogram(df_era_hr[filter,:temp],normalize = :pdf)
c = scatter(df_era_hr[filter,:temp],df_era_hr[filter,:precip]) # no change in shapes when low values are cut. I don't think theres a strong correlation.
display(cor(df_era_hr.temp[filter]./u"K",df_era_hr.precip[filter]./u"mm"))
plot(a,b,c)-0.10028165103066897
# Plot Distribution of Daily Precipitation from ERA5 and CPC obvervations
a = histogram(df_d[!,:precip_sum_era],title = "Era")
b = histogram(df_d[!,:precip_cpc], title = "CPC")
c= scatter(df_d[!,:precip_cpc],df_d[!,:precip_sum_era],title = "Q-Q") # absolute garbage
plot(a,b,c)
3.2 Distribution Fitting
function plot_dist(dist; name="", xlims=missing)
ub = quantile(dist, 0.998)
lb = quantile(dist, 0.002)
p = plot(x -> pdf(dist, x); ylabel="Probability Density", label=name, xlims=(lb, ub))
!ismissing(xlims) && xlims!(p, xlims)
return p
end
LogNormal vs Pareto
fitted = fit(LogNormal,(df_era_hr[df_era_hr.precip .> 0.0u"mm",:precip]./u"mm"))
histogram(df_era_hr[df_era_hr.precip .> 0.0u"mm",:precip]./u"mm",normalize = true,xlims=(0,2))
plot!(fitted,xlims=(0,2))
4 Methods
- Q-Q bias correction for ERA5 Precipitation and Precipitation
- Fit a distribution to Temperature
- Fit a distribution to ERA5 Precipitation
- Write a function that links Temperature distribution to ERA5 Preciptation distribution
function link_function1(value)
return value
end
function link_function2(value)
return abs(value)
end
@model function regression(y::AbstractVector, x::AbstractVector)
α ~ Normal(0, 1)
β ~ Normal(0, 1)
γ ~ Normal(0, 1)
χ ~ Normal(0, 1)
for i in eachindex(y)
μ = link_function1(α + β * x[i])
σ = link_function2(γ + χ * x[i])
y[i] ~ LogNormal(μ,σ)
end
endregression (generic function with 2 methods)
X = df_era_hr.temp[df_era_hr.precip .> 0.0u"mm"]./u"K"
y = df_era_hr.precip[df_era_hr.precip .> 0.0u"mm"]./u"mm"
logistic_chn = let
model = regression( y , X )
sampler = externalsampler(DynamicHMC.NUTS())
nsamples = 100
sample(model, sampler, nsamples; drop_warmup=true)
end
plot(logistic_chn)Sampling: 2%|▉ | ETA: 0:00:12Sampling: 7%|██▉ | ETA: 0:00:20Sampling: 9%|███▊ | ETA: 0:00:18Sampling: 13%|█████▍ | ETA: 0:00:14Sampling: 14%|█████▊ | ETA: 0:00:15Sampling: 15%|██████▏ | ETA: 0:00:14Sampling: 19%|███████▊ | ETA: 0:00:11Sampling: 22%|█████████ | ETA: 0:00:11Sampling: 23%|█████████▍ | ETA: 0:00:12Sampling: 43%|█████████████████▋ | ETA: 0:00:08Sampling: 47%|███████████████████▎ | ETA: 0:00:07Sampling: 49%|████████████████████▏ | ETA: 0:00:07Sampling: 51%|████████████████████▉ | ETA: 0:00:06Sampling: 59%|████████████████████████▎ | ETA: 0:00:06Sampling: 60%|████████████████████████▋ | ETA: 0:00:05Sampling: 61%|█████████████████████████ | ETA: 0:00:05Sampling: 63%|█████████████████████████▉ | ETA: 0:00:05Sampling: 64%|██████████████████████████▎ | ETA: 0:00:05Sampling: 66%|███████████████████████████ | ETA: 0:00:05Sampling: 67%|███████████████████████████▌ | ETA: 0:00:05Sampling: 68%|███████████████████████████▉ | ETA: 0:00:04Sampling: 69%|████████████████████████████▎ | ETA: 0:00:04Sampling: 73%|█████████████████████████████▉ | ETA: 0:00:04Sampling: 75%|██████████████████████████████▊ | ETA: 0:00:03Sampling: 76%|███████████████████████████████▏ | ETA: 0:00:03Sampling: 77%|███████████████████████████████▋ | ETA: 0:00:03Sampling: 78%|████████████████████████████████ | ETA: 0:00:03Sampling: 88%|████████████████████████████████████▏ | ETA: 0:00:02Sampling: 90%|████████████████████████████████████▉ | ETA: 0:00:01Sampling: 91%|█████████████████████████████████████▎ | ETA: 0:00:01Sampling: 92%|█████████████████████████████████████▊ | ETA: 0:00:01Sampling: 100%|█████████████████████████████████████████| Time: 0:00:13
function predict_precip_one(chn,T_i, R_n, RD)
elapsed_precip = sum(R_n)
max_precip = RD
α = mean(chn[:α])
β = mean(chn[:β])
γ = mean(chn[:γ])
χ = mean(chn[:χ])
μ = link_function1(α+β*T_i)
#print(size(μ))
σ = link_function2(γ+χ*T_i)
#print(size(σ))
dist = LogNormal(μ,σ)
R_i = [Inf64]
LL_i = 0.0
#print(typeof(R_i))
#print(typeof(max_precip))
#print(typeof(elapsed_precip))
counter = 0
#println("maxL", max_precip)
if max_precip == 0.0
R_i[1] = 0.0
#println("nob")
else
#println("coooob")
while ((R_i[1] > (max_precip-elapsed_precip)) | (R_i == Inf))
counter = counter + 1
if (max_precip - elapsed_precip) < 0.0
R_i[1] = 0.0
LL_i = 0.0
break
end
R_i[1] = rand(dist,1)[1]
LL_i = logpdf(dist,R_i[1])
if (R_i[1]+elapsed_precip) > max_precip
R_i[1] = Inf
LL_i = -Inf
end
if counter > 100
R_i[1]= 0.0
LL_i = 0.0
break
end
end
end
return R_i[1],LL_i
endpredict_precip_one (generic function with 1 method)
Output of this section should be parameters to obtain the distribution of hourly rainfall given an hourly 2meter-temperature.
4.1 Testing
df_era_hr[!,:model] = df_era_hr.precip.*0.0
df_era_hr[!,:loglikelihood] = df_era_hr.precip ./u"mm" .*0.0
for (a_idx,a) in enumerate(eachrow(df_d[:,:]))
offset_start = (a_idx-1)*24+1
offset_end = (a_idx)*24
for (idx,b) in enumerate(eachrow(df_era_hr[offset_start:offset_end ,:]))
if idx != 1
R_n = df_era_hr[offset_start:offset_end,:model]./u"mm"
else
R_n = [0.0]
end
R_i,LL_i = predict_precip_one(logistic_chn,b[:temp]./u"K",R_n,a[:precip_sum_era]./u"mm")
#println("R_i ", R_i )
#println("Idx ", idx)
#println("max_precip",a[:precip_sum_era])
df_era_hr[(idx-1)+offset_start,:model] = R_i * u"mm"
df_era_hr[(idx-1)+offset_start,:loglikelihood] = LL_i[1]
end
end
println(sum(df_era_hr.precip[:]))
println(sum(df_era_hr.model[:]))
println(sum(df_era_hr.loglikelihood))
println(cor(df_era_hr.precip./u"mm",df_era_hr.model./u"mm"))
a = plot(df_era_hr.precip[:])
plot!(df_era_hr.model[:])
b = scatter(df_era_hr.precip, df_era_hr.model)
plot(a,b)1144.4259717656432 mm
634.2373505690837 mm
8708.80472502395
0.1545087003431103
reconstruct_daily_precip = combine(groupby(df_era_hr,:time), :precip => sum, :model => sum, :loglikelihood => sum)
a = plot(reconstruct_daily_precip.precip_sum)
plot!(reconstruct_daily_precip.model_sum)
b = scatter(reconstruct_daily_precip.precip_sum,reconstruct_daily_precip.model_sum)
plot(a,b)